library(plyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(dplyr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
Load the Indeed dataset from the GitHub project repository.
Removing the radius column from read_indeed_url
# Drop the fifth column by position (the report describes it as the radius
# column), then preview the first rows of the trimmed data frame.
read_indeed_url <- read_indeed_url[, -5]
head(read_indeed_url, n = 6L)
Rename the columns in read_indeed_url
# Give the six remaining columns readable headers, then preview the result.
colnames(read_indeed_url) <- c(
  "Source",
  "Job Title",
  "Skills",
  "City",
  "url",
  "Count"
)
head(read_indeed_url)
Removing rows that contain NA values from the read_indeed_url data frame
# Keep only the rows in which every column is populated, then preview them.
read_indeed_url <- read_indeed_url %>%
  na.omit()
head(read_indeed_url)
Sorting the ny_indeed data frame by count
# Sort the NY rows by `count`, largest first.
# The column is referenced by its bare name so that dplyr resolves it inside
# the data frame flowing through the pipe, instead of re-reading the global
# `ny_indeed` object (which would silently misalign if the piped data were
# ever filtered or grouped beforehand).
ny_indeed <- ny_indeed %>%
  arrange(desc(count))
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Base-R aggregation: sum Count for each distinct Skills value.
# The result has one row per skill, with columns `Category` and `x`.
indeed_skillaggr <- with(
  read_indeed_url,
  aggregate(Count, by = list(Category = Skills), FUN = sum)
)
indeed_skillaggr
Jobs by skills
# Total postings per skill, built step by step:
#   1. group the rows by skill,
#   2. collapse each group to the sum of its Count values,
#   3. order the skills from the highest total to the lowest.
skills_count <- group_by(read_indeed_url, Skills)
skills_count <- summarise(skills_count, Total = sum(Count))
skills_count <- arrange(skills_count, desc(Total))
skills_count
Job openings by city
# Total postings for every (skill, city) pair, ordered from the highest total
# to the lowest.
# summarise() only peels off the last grouping variable, so without ungroup()
# the result would stay grouped by Skills and later verbs would operate
# per-skill; ungroup() returns a plain table.
skills_city <- read_indeed_url %>%
  group_by(Skills, City) %>%
  summarise(Total = sum(Count)) %>%
  ungroup() %>%
  arrange(desc(Total))
skills_city
Grouping ny_indeed dataset by type
# Sum `count` within each `type` of the NY data: keep only the two columns
# involved, then collapse to one row per type.
grpd <- select(ny_indeed, type, count)
grpd <- summarise(group_by(grpd, type), sum_by_type = sum(count))
# `skills_count` is sorted by descending Total, so its first rows are the
# skills with the highest totals; head() keeps those, whereas tail() would
# select the ten lowest totals.
plots_top <- head(skills_count, 10)
darkcols <- brewer.pal(8, "Dark2")
# Bar labels are passed straight from the data frame so that no variable
# called `names` shadows base::names().
barplot(
  plots_top$Total,
  main = "Indeed Counts",
  horiz = TRUE,
  names.arg = plots_top$Skills,
  las = 1,          # keep the axis labels horizontal
  col = darkcols,   # 8 colours; recycled if more than 8 bars are drawn
  cex.axis = 0.5,
  cex.names = 0.5
)
# Take the ten largest (skill, city) totals; `skills_city` is already sorted
# by descending Total. head() returns fewer rows when fewer than ten exist,
# whereas indexing with [1:10, ] would over-index a shorter table.
top10_skills <- head(skills_city, 10)
# Bubble chart: skill on x, total on y and as point size, city as colour.
ggplot(
  top10_skills,
  aes(x = Skills, y = Total, colour = City, size = Total)
) +
  geom_point()
library(wordcloud)
# Word cloud of skills, sized by total postings; random.order = FALSE places
# the most frequent words first, towards the centre.
# `scale` is the (largest, smallest) word size. The package default maximum
# of 4 was too big for a long label ("Machine Learning" was dropped with a
# "could not be fit on page" warning), so the maximum is lowered.
# NOTE(review): 3 is a conservative guess; tune it to the output device size.
wordcloud(
  skills_count$Skills,
  skills_count$Total,
  scale = c(3, 0.5),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(skills_count$Skills, skills_count$Total, random.order
## = FALSE, : Machine Learning could not be fit on page. It will not be
## plotted.
Drilling down on the Data Scientist jobs in NY. Let's look at a horizontal bar chart of all skills, with the skill type indicated by the bar's color.
# Order the keyword factor levels by ascending `count` so the categorical
# axis follows the counts rather than alphabetical order.
# The keywords are sorted first and de-duplicated afterwards: indexing
# unique(key_words) with a full-length order() permutation would produce NA
# levels as soon as a keyword occurs in more than one row.
ny_indeed$key_words <- factor(
  ny_indeed$key_words,
  levels = unique(
    ny_indeed$key_words[order(ny_indeed$count, decreasing = FALSE)]
  )
)

# Plot margins: left, right, bottom, top, and padding.
m <- list(
  l = 100,
  r = 100,
  b = 100,
  t = 100,
  pad = 4
)

# Horizontal bar chart: one bar per keyword, coloured by skill type.
# The margin list is handed to layout(); previously it was built but unused.
key_word_plot <- plot_ly(
  data = ny_indeed,
  x = ~count,
  y = ~key_words,
  type = "bar",
  orientation = "h",
  color = ~type
) %>%
  layout(title = "Skills Required of Data Scientists in NY", margin = m)
key_word_plot
Now let's look at which type of skill was mentioned most often in job descriptions by plotting the aggregated data.
# Order the `type` factor levels by ascending total so the categorical axis
# follows the totals. Sorting before de-duplicating keeps the permutation from
# order() and the vector it indexes the same length, and FALSE is spelled out
# because `F` is an ordinary, reassignable variable.
grpd$type <- factor(
  grpd$type,
  levels = unique(grpd$type[order(grpd$sum_by_type, decreasing = FALSE)])
)

# Horizontal bar chart of the summed counts, one bar per skill type.
sum_by_type <- plot_ly(
  data = grpd,
  x = ~sum_by_type,
  y = ~type,
  type = "bar",
  orientation = "h",
  color = ~type
) %>%
  layout(title = "NY Skills by Type")
sum_by_type
# Step4 Conclusion
The top 5 hard skills are oriented towards Big Data; Python and R are the words used most often in the dataset. Our findings show that a few skills underlie the field of data science. The importance of an education in Mathematics is clear from the number of mentions in Data Scientist job descriptions. We cannot draw definitive conclusions about in-demand skills, since some other skills are missing from the dataset.